1 RNA-seq of muscles

We use files from SYSTEMS BIOLOGY NII, IMBP, 2nd RNA-seq, ok quality. (First RNA-seq failed because of EuroMint polymerase usage.) Fastq files there: /data7a/bio/human_genomics/muscle/imbp_muscles_generozov/

During sequencing, the samples were divided into four lanes to avoid cell fallout. This would result in bias, with rare transcripts underrepresented and common ones overrepresented. We don’t analyze this bias. Coverage is OK, and they also include exome and methylation information. Reads was not trimmed. Fastq read from 4 lanes was concatenated in 1 total file and quantified by Salmon.

metadata <- read.csv("/data7a/bio/human_genomics/muscle/imbp_muscles_generozov/meta_imbp_muscles.csv")
tx2gene <- read.csv("/data7a/bio/human_genomics/muscle/analysis/timur_files/itog/tx2gene.csv")
met <- read.csv("/data7a/bio/human_genomics/muscle/analysis/timur_files/itog/samples.csv", stringsAsFactors = F) 
met$Condition <- as.factor(met$Condition)
met$Person <- as.character(met$Person)
rownames(met) <- met$Sample

file_n <- setNames(met$Filepath, met$Sample)

Expression matrix

txi <- tximport(
  file_n,
  type = "salmon",
  tx2gene = tx2gene,
  ignoreTxVersion = TRUE,
  countsFromAbundance = "no")
saveRDS(txi, '/data7a/bio/human_genomics/muscle/analysis/RDS/txi.RDS')

Lets check values’ distribution. Попробуем разделить денсити по образцам. We observe matrix in base state, not in log state.

2 PCA plots

Lets plot pca to check bias.

log_pca <- log10(tpm_matrix + 1)
pca <- pca(log_pca, metadata = met, removeVar = 0.1)
screeplot(pca, components = getComponents(pca, seq_len(6)))

col_by <- "Condition"
biplot(
  pca, x = "PC1", y = "PC2", pointSize = 3, colby = col_by,
  legendPosition="right", showLoadings = F)

Findings from PCA:

  1. There is an artifact sample 9_B1, it needs to be counted with and without it.
  2. You can try to identify the tissue of 9_B1.
  3. Points B1 and B2 are divided into groups, but half of the B2 points are mixed into the B1 group. This indicates either strong individual differences (e.g., different muscle growth rates—this requires calibrating the measurement instruments) or a technical error (missing the target and taking the wrong tissue, poor rRNA depletion, insufficient sample size).

3 Differential Expression

Lets count with and without 9B outliers samples.

dds <- DESeqDataSetFromTximport(
  txi,
  colData = met,
  design = ~ Person + Condition)

keep <- rowSums(counts(dds) >= 10) >= ncol(dds)/2
dds <- dds[keep, ]

dds <- DESeq(dds)
res_all <- results(dds, contrast = c("Condition", "Trained", "Untrained"))

# Фильтрация значимых генов
sig_genes_all <- as.data.frame(res_all)[!is.na(res_all$padj) & res_all$padj < 0.05 & abs(res_all$log2FoldChange) > 1, ]
# add symbols
sig_genes_all$symbol <- mapIds(org.Hs.eg.db, keys = rownames(sig_genes_all),
column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first") 
sig_genes_all <- sig_genes_all[order(sig_genes_all$padj), ]

Try without weird sample.

met_filt <- met[met$Sample != "9_B1", ] 
samples_to_keep <- met_filt$Sample

txi_filt <- txi
txi_filt$counts    <- txi$counts[, samples_to_keep]
txi_filt$abundance <- txi$abundance[, samples_to_keep]
txi_filt$length    <- txi$length[, samples_to_keep]
rm(samples_to_keep)

dds2 <- DESeqDataSetFromTximport(
  txi_filt,
  colData = met_filt,
  design = ~ Person + Condition)
keep <- rowSums(counts(dds2) >= 10) >= ncol(dds2)/2
dds2 <- dds2[keep, ]

dds2 <- DESeq(dds2, parallel = T)
res_nooutl <- results(dds2, contrast = c("Condition", "Trained", "Untrained"))

# Фильтрация значимых генов
sig_genes_nooutl <- as.data.frame(res_nooutl)[!is.na(res_nooutl$padj) & res_nooutl$padj < 0.05 & abs(res_nooutl$log2FoldChange) > 1, ]
# add symbols
sig_genes_nooutl$symbol <- mapIds(org.Hs.eg.db, keys = rownames(sig_genes_nooutl),
column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first") 
sig_genes_nooutl <- sig_genes_nooutl[order(sig_genes_nooutl$padj), ]
write_csv(sig_genes_nooutl, '/data7a/bio/human_genomics/muscle/analysis/download/DEGs_nooutl.csv')

Check intersection between outliers filtered and unfiltered.

v <- list(Filtered = rownames(sig_genes_nooutl), `All samples` = rownames(sig_genes_all))
ggVennDiagram(v) + scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") + coord_flip()

3.1 Compare two ways

print("Without outlier")
## [1] "Without outlier"
summary(res_nooutl)
## 
## out of 14528 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 979, 6.7%
## LFC < 0 (down)     : 592, 4.1%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("All samples")
## [1] "All samples"
summary(res_all)
## 
## out of 14582 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 868, 6%
## LFC < 0 (down)     : 471, 3.2%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

The DEGs number increased, not decreased. If the outlier carried a biological signal, there would have been fewer of them. Here, it’s the opposite—the outlier smeared the variance, weakened statistical power, and obscured the effects between groups.

MA-plot — is a diagram in which each point is a gene.

  • A (average expression) — average gene expression level (log scale)

  • M (minus / fold change) — logarithm of the expression ratio between two conditions (log2FoldChange)

    A = 0.5 * log2(baseMean) M = log2FoldChange(sample2 / sample1)

plotMA(res_nooutl, ylim = c(-5, 5), main = "MA-plot Filtered")

plotMA(res_all, ylim = c(-5, 5), main = "MA-plot Unfiltered")

We need to look at the cloud’s displacement to see if the normalization is correct.

Conclusion from the MA plot: the Filtered values ​​are more closely clustered, meaning the dispersion has decreased, meaning the outlier was indeed noisy.

Dispersion estimates for filtered and unfilt.

plotDispEsts(dds, main = "Dispersion estimates unf")

plotDispEsts(dds2, main = "Dispersion estimates flt")

Lets check correlation between all FC in all and no_outl groups.

## [1] 0.9478472

Correlation is almost the same, no major difference.

3.1.1 Volcano plot and DEGs summary

3.1.2 Boxplots for DEGs

3.1.3 Comparison with IMBP precomputed DEG’s list

People from the Institute of Medical Problems calculated before us and gave us a ready-made table with DEGs; we need to compare our results with it.

deg_imbp <- read_excel('/data7a/bio/human_genomics/muscle/analysis/upload/imbp_results_DEGs_filtered.xlsx')
sig_genes_imbp <- deg_imbp %>% filter( abs(log2FoldChange) > 1, padj <= 0.05)

v <- list(`My DEGs` = sig_genes_nooutl$symbol, IMBP = sig_genes_imbp$gene_name)
ggVennDiagram(v) + scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") + coord_flip()

v <- list(`My DEGs ensg` = rownames(sig_genes_nooutl), IMBP = sig_genes_imbp$gene_id)
ggVennDiagram(v) + scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") + coord_flip()

v <- list(`My DEGs` = sig_genes_all$symbol, IMBP = sig_genes_imbp$gene_name)
ggVennDiagram(v) + scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") + coord_flip()

A <- sig_genes_nooutl$symbol
B <- sig_genes_imbp$gene_name
only_A <- setdiff(A, B)
only_B <- setdiff(B, A)
both_AB <- intersect(A, B)


cat("Only in My DEGs (A):\n", paste(only_A, collapse = ", "), "\n\n")
## Only in My DEGs (A):
##  EDNRB, DOC2B, PTGFRN, COL4A1, LYN, MKI67, PDE10A, NA, HSPA6, VDR, TRAF4, HSPA7, MSR1, FAP, SLC4A3, CHST1, ARHGAP4, COL4A2, TANC2, PNCK, ARMCX2, VSIG4, CASTOR1, KCNJ3, ARL4C, PRKY, LOXHD1, SNCA, MBOAT2, FRMD4A, MYLK, LAD1, PLEKHA4, ADAM28, TAPBP, HTR7, LOXL1, PAG1, FGD2, FERMT3, MYOF, OXER1, PRSS36, DOCK7, TYROBP, NCAPD3, HGF, DOCK10, MMP16, MARCHF1, EPS15, CD48, SERPINE1, NHSL1, OVOS2, LRRC70, ENTPD7, LRRC17, EEF2K, F2RL3, IGSF10, P2RY13, IRF5, HDC, ACTG2, IL10RA, PLD1, CFI, ADGRB2, JAML, POLR2A, GBP5, GALNT5, NSG1, PRXL2B, CLIC1, KCNB1, ARHGAP9, TNFRSF4, TOP2A, FGD3, LOC728488, TDRD6, PRR5L, ADAMTS6, RBBP8, MS4A14, BEND6, APOL5, TBC1D10C, GPR183, GPR173, LOC100129534, PKHD1L1, ITGAM, ILDR2, ACTC1
cat("Only in IMBP (B):\n", paste(only_B, collapse = ", "), "\n\n")
## Only in IMBP (B):
##  FMO1, ANLN, IGF1, ELN, COL11A1, FCGR2B, SEMA5B, ADAMTS2, MMP2, CFAP61, HEPH, KDELR3, ACP5, SFRP4, OGN, MS4A6A, MS4A4A, TP53I3, PLP1, TNFAIP6, F13A1, EVI2A, TUBA4A, CLEC10A, LOXL2, PCNX2, SLC41A2, AOAH, SLCO1C1, CYP4B1, FCGR2A, CTSK, VASH2, SUSD4, LEFTY2, TMEFF2, MYLK4, LPAR4, PAMR1, MS4A2, KIAA1755, PIEZO2, C1QC, OLFML2B, FSTL1, EDIL3, ESM1, CTHRC1, HTRA1, FBN1, C2, CERCAM, TOP6BL, HSD11B2, SHISA3, FCER1A, FOXS1, UTS2R, TMEM45A, EPHB3, THBS2, MYMK, SAMD11, FAM180B, MPEG1, DPP4, DIO2, MS4A4E, MKRN2OS, AKR1B15, FCGR2C, MILR1, CCL14
cat("In both lists:\n", paste(both_AB, collapse = ", "), "\n")
## In both lists:
##  PLIN5, CCDC80, MXRA5, S100A4, TRO, CALML6, SPARC, MMP19, FNDC1, COL3A1, LOX, COL6A6, COL5A1, PRND, ASPN, PENK, CPXM1, COL1A1, COL5A2, TPSAB1, MAP1A, MS4A7, THY1, C1QTNF3, FPR3, NEU4, NRK, RNASE6, PLCB2, CPA4, PRG4, BGN, COL1A2, ST8SIA2, TMEM63C, ARHGAP36, MFAP2, DNM1, SH3PXD2B, SFRP2, MEST, TMEM119, SPON1, CAPN6, THBS4, COL14A1, MYL4, AHNAK2, CTSG, MYL5, BASP1, CD1C, CCN3, CD86, CCN4, CD24, PRUNE2, PLPPR4, CPA3, COL8A2, CD163L1, EDA2R, CRISPLD1, POSTN, PTK7
tibble(
  Group = c("Only in My DEGs", "Only in IMBP", "In both lists", 'IMBP total', 'My DEGs total'),
  Count = c(length(only_A), length(only_B), length(both_AB), length(B), length(A))
) %>% DT::datatable(caption = "My DEGs vs IMBP comparison")
rm(A,B, only_A, only_B, both_AB)

3.1.3.1 Compare ours and their LFC.

for_plot <- as.data.frame(res_nooutl)
for_plot$symbol <- mapIds(org.Hs.eg.db, keys = rownames(for_plot),
column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first") 

for_plot <- for_plot %>% left_join(deg_imbp, by=c("symbol"='gene_name')) %>% 
  dplyr::rename(Our_LFC = log2FoldChange.x,
imbp_LFC = log2FoldChange.y)

ggplot(for_plot, aes(x=Our_LFC, y=imbp_LFC)) +
  geom_point()

LFC correlate with each other, which proves that we analyzed data in a similar way.

4 GO enrichment

Lets make an enrichment for filtered DEGs.

Before the annotation, lets perform a gene enrichment analysis in Gene Ontology (GO) separately for three main categories (namespaces):

  1. Molecular Function (MF). Describes the specific biochemical activity of a gene product (protein or RNA) at the molecular level. Examples: catalytic activity, ion binding, transport activity, DNA binding function. Essence: what exactly the molecule can do in a biochemical sense.
  2. Biological Process (BP). Characterizes a coordinated series of molecular events leading to a result significant for the organism. Examples: cell cycle, glucose metabolism, immune response, apoptosis, nervous system development. Essence: what integral biological phenomenon is ensured by the work of the gene/protein.
  3. Cellular Component (CC). Determines the subcellular localization or macromolecular complex where the gene product functions.

pAdjustMethod = “fdr”, pvalueCutoff = 0.01

4.0.1 MF enrich

go_mf <- enrichGO(sig_genes_nooutl$symbol, OrgDb="org.Hs.eg.db", ont="MF", keyType= "SYMBOL", pAdjustMethod = "fdr", pvalueCutoff  = 0.01, qvalueCutoff  = 0.05)
go_mf <- go_mf@result %>% arrange(p.adjust) %>% head(20) 
DT::datatable(go_mf, caption = "GO MF enrich")
write_csv(go_mf, '/data7a/bio/human_genomics/muscle/analysis/download/go_mf.csv')

4.0.2 BP enrich

go_bp_a <- enrichGO(sig_genes_nooutl$symbol, OrgDb="org.Hs.eg.db", ont="BP", keyType       = "SYMBOL", pAdjustMethod = "fdr", pvalueCutoff  = 0.01, qvalueCutoff  = 0.05)
go_bp <- go_bp_a@result %>% arrange(p.adjust) %>% head(20)
DT::datatable(go_bp, caption = "GO BP enrich")
write_csv(go_bp, '/data7a/bio/human_genomics/muscle/analysis/download/go_bp.csv')

4.0.3 CC enrich

go_cc <- enrichGO(sig_genes_nooutl$symbol, OrgDb="org.Hs.eg.db", ont="CC", keyType       = "SYMBOL", pAdjustMethod = "fdr", pvalueCutoff  = 0.01, qvalueCutoff  = 0.05)
go_cc <- go_cc@result %>% arrange(p.adjust)%>% head(20)
DT::datatable(go_cc, caption = "GO CC enrich")
write_csv(go_cc, '/data7a/bio/human_genomics/muscle/analysis/download/go_cc.csv')

4.0.4 Other

4.0.4.1 cnetplot bp

barplot(go_bp_a, showCategory = 20) + ggtitle("GO enrichment filtered")

#cnetplot(go, foldChange = NULL)
cnetplot(go_bp_a, foldChange = sig_genes_nooutl$log2FoldChange)

4.0.4.2 compare GO: All samples vs No outliers.

genes_list <- list(
  NoOutlier = sig_genes_nooutl$symbol,
  AllSamples = sig_genes_all$symbol
)
cc <- compareCluster(
  geneClusters = genes_list,
  fun = "enrichGO",
  OrgDb = org.Hs.eg.db,
  ont = "BP",
  keyType = "SYMBOL",
  pvalueCutoff = 0.01,
  qvalueCutoff = 0.05,
  pAdjustMethod = "BH"
)
dotplot(cc, showCategory = 20) + 
  ggtitle("GO NoOutlier vs AllSamples")

as.data.frame(cc) %>% arrange(p.adjust) %>% head(20) %>% DT::datatable()